{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ncjQuwOgT4W3"
      },
      "source": [
        "# 🗣️ Speech-AI-Forge Colab\n",
        "\n",
        "👋 This script is built on [Speech-AI-Forge](https://github.com/lenML/Speech-AI-Forge). If this project helps you, feel free to support us with a star on GitHub! Contributions via PRs and issues are also welcome~\n",
        "\n",
        "## Usage Guide\n",
        "\n",
        "1. Select **Runtime** from the menu.\n",
        "2. Click **Run All**.\n",
        "\n",
        "Once the process is complete, look for the following information in the log:\n",
        "\n",
        "```\n",
        "Running on public URL: https://**.gradio.live\n",
        "```\n",
        "\n",
        "This link will be the public URL you can access.\n",
        "\n",
        "> Note: If prompted to restart during package installation, please select \"No.\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fvzfP6Tn8nOL"
      },
      "source": [
        "## Environment"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "09UPJy1gP045",
        "outputId": "079a9d2c-54aa-4ff5-a969-70e27029f3ed"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "# Skip restarting message in Colab\n",
        "import sys; modules = list(sys.modules.keys())\n",
        "for x in modules: sys.modules.pop(x) if \"PIL\" in x or \"google\" in x else None\n",
        "\n",
        "# 1. Clone the repository\n",
        "!git clone https://github.com/lenML/Speech-AI-Forge\n",
        "\n",
        "# 2. Change directory to the repository\n",
        "%cd Speech-AI-Forge\n",
        "\n",
        "# 3. Install ffmpeg and rubberband-cli\n",
        "!apt-get update -y\n",
        "!apt-get install -y ffmpeg rubberband-cli\n",
        "\n",
        "# 4. Install dependencies\n",
        "!pip install -r requirements.txt\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hLupWhOXT1kt"
      },
      "source": [
        "## Download Models"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gZBGws7c8nON",
        "outputId": "73de300c-da8e-4fd5-ea37-73cfecd2f2f7"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "from scripts import dl_chattts, dl_enhance\n",
        "from scripts.downloader import fish_speech_1_4, cosyvoice2, fire_red_tts, f5_tts, vocos_mel_24khz, faster_whisper, open_voice\n",
        "\n",
        "# @markdown ## Model Download Instructions\n",
        "# @markdown Most models are close to 2GB in size, please ensure you have sufficient storage space and network bandwidth. <br/>\n",
        "# @markdown **Note**: You must select at least one TTS model. If none are selected, ChatTTS will be downloaded by default.\n",
        "\n",
        "# @markdown ### Hugging Face Token (Optional)\n",
        "# @markdown Some models may require a configured Hugging Face Token to download. If you need to use these models, enter your Token here. You can get your Token from [Hugging Face](https://huggingface.co/settings/tokens).\n",
        "HF_TOKEN = \"\" #@param {type:\"string\"}\n",
        "\n",
        "if HF_TOKEN:\n",
        "  os.environ[\"HF_TOKEN\"] = HF_TOKEN # Set the \"HF_TOKEN\" environment variable\n",
        "  print(\"HF_TOKEN environment variable configured.\")  # Helpful feedback\n",
        "else:\n",
        "  print(\"No HF_TOKEN provided, skipping environment variable configuration.\")  # Good to know why\n",
        "\n",
        "# @markdown ### TTS Models\n",
        "# @markdown ChatTTS: [GitHub](https://github.com/2noise/ChatTTS) - A generative speech model for daily dialogue.\n",
        "download_chattts = True  # @param {\"type\":\"boolean\", \"placeholder\":\"Download ChatTTS Model\"}\n",
        "# @markdown F5-TTS: [GitHub](https://github.com/SWivid/F5-TTS) - A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching\n",
        "download_f5_tts = False  # @param {\"type\":\"boolean\"}\n",
        "# @markdown CosyVoice: [GitHub](https://github.com/FunAudioLLM/CosyVoice) - Multi-lingual large voice generation model, providing inference, training and deployment full-stack ability.\n",
        "download_cosyvoice = False  # @param {\"type\":\"boolean\"}\n",
        "# @markdown FireRedTTS: [GitHub](https://github.com/FireRedTeam/FireRedTTS) - An Open-Sourced LLM-empowered Foundation TTS System\n",
        "download_fire_red_tts = False  # @param {\"type\":\"boolean\"}\n",
        "# @markdown FishSpeech: [GitHub](https://github.com/fishaudio/fish-speech) - Brand new TTS solution\n",
        "download_fish_speech = False  # @param {\"type\":\"boolean\"}\n",
        "\n",
        "# @markdown ### ASR Model\n",
        "# @markdown Whisper: [GitHub](https://github.com/openai/whisper) - Robust Speech Recognition via Large-Scale Weak Supervision\n",
        "download_whisper = True  # @param {\"type\":\"boolean\"}\n",
        "\n",
        "# @markdown ### Voice Cloning Model\n",
        "# @markdown OpenVoice: [GitHub](https://github.com/myshell-ai/OpenVoice) - Instant voice cloning by MIT and MyShell.\n",
        "download_open_voice = False  # @param {\"type\":\"boolean\"}\n",
        "\n",
        "# @markdown ### Enhancement Model\n",
        "# @markdown resemble-enhance: [GitHub](https://github.com/resemble-ai/resemble-enhance) - AI powered speech denoising and enhancement\n",
        "download_enhancer = True  # @param {\"type\":\"boolean\"}\n",
        "\n",
        "\n",
        "# Check if at least one TTS model is selected\n",
        "if not any([download_chattts, download_fish_speech, download_cosyvoice, download_fire_red_tts, download_f5_tts]):\n",
        "    print(\"No TTS models selected, downloading ChatTTS by default...\")\n",
        "    download_chattts = True\n",
        "\n",
        "dl_source=\"huggingface\"\n",
        "\n",
        "# TTS Model Downloads\n",
        "if download_chattts:\n",
        "    print(\"Downloading ChatTTS...\")\n",
        "    dl_chattts.ChatTTSDownloader()(source=dl_source)\n",
        "    print(\"Downloading ChatTTS, completed\")\n",
        "\n",
        "if download_fish_speech:\n",
        "    print(\"Downloading FishSpeech...\")\n",
        "    fish_speech_1_4.FishSpeech14Downloader()(source=dl_source)\n",
        "    print(\"Downloading FishSpeech, completed\")\n",
        "\n",
        "if download_cosyvoice:\n",
        "    print(\"Downloading CosyVoice...\")\n",
        "    cosyvoice2.CosyVoice2Downloader()(source=dl_source)\n",
        "    print(\"Downloading CosyVoice, completed\")\n",
        "\n",
        "if download_fire_red_tts:\n",
        "    print(\"Downloading FireRedTTS...\")\n",
        "    fire_red_tts.FireRedTTSDownloader()(source=dl_source)\n",
        "    print(\"Downloading FireRedTTS, completed\")\n",
        "\n",
        "if download_f5_tts:\n",
        "    print(\"Downloading F5TTS...\")\n",
        "    f5_tts.F5TTSDownloader()(source=dl_source)\n",
        "    vocos_mel_24khz.VocosMel24khzDownloader()(source=dl_source)\n",
        "    print(\"Downloading F5TTS, completed\")\n",
        "\n",
        "# ASR Model Downloads\n",
        "if download_whisper:\n",
        "    print(\"Downloading Whisper...\")\n",
        "    faster_whisper.FasterWhisperDownloader()(source=dl_source)\n",
        "    print(\"Downloading Whisper, completed\")\n",
        "\n",
        "# Voice Cloning Model Downloads\n",
        "if download_open_voice:\n",
        "    print(\"Downloading OpenVoice...\")\n",
        "    open_voice.OpenVoiceDownloader()(source=dl_source)\n",
        "    print(\"Downloading OpenVoice, completed\")\n",
        "\n",
        "# Enhancement Model Downloads\n",
        "if download_enhancer:\n",
        "    print(\"Downloading ResembleEnhance...\")\n",
        "    dl_enhance.ResembleEnhanceDownloader()(source=dl_source)\n",
        "    print(\"Downloading ResembleEnhance, completed\")\n",
        "\n",
        "print(\"All selected models have been downloaded!\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RtpfdFem8nOO"
      },
      "source": [
        "## Run WebUI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hi6eW_KLDc_p",
        "outputId": "5de77d0c-a3ee-4b86-87a8-490fdcdda794"
      },
      "outputs": [],
      "source": [
        "!nvcc --version"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "J5CkIs0nFZKi",
        "outputId": "b21554f2-e82b-422e-dc82-dfc675bc6c49"
      },
      "outputs": [],
      "source": [
        "!nvidia-smi"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "X8x_dWk88nOO",
        "outputId": "9029743a-d214-4385-a241-ef5de1f2c110"
      },
      "outputs": [],
      "source": [
        "!python webui.py --share --language=en"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
